#include <simd.h>
/* DMA descriptor type used by every macro in this file. It is an alias for
 * intv8, which is not declared here (presumably supplied by <simd.h>). */
typedef intv8 dmav_t;

/* Mode selectors used when dma_init() sets up its descriptors.
 * Bit 0 is set in the two GET values; bit 4 is set only in BCAST_GET. */
#define PE_GET ((0 << 4) | 1)
#define PE_PUT ((0 << 4) | 0)
#define BCAST_GET ((1 << 4) | 1)

/*
 * dmav_init(desc, mode): build a fresh descriptor from a mode value.
 *
 * desc is a write-only operand ("=r"); mode is read-only ("r").
 * The template first shifts mode left by 24 bits into desc, then issues a
 * vinsw whose inserted operand is desc, whose base operand is $31 and whose
 * index is 1, writing the result back to desc.
 * NOTE(review): presumably $31 reads as zero and vinsw inserts one 32-bit
 * word at the given index, so the result would be an otherwise-zero vector
 * carrying (mode << 24) in word 1 -- confirm against the ISA manual.
 */
#define dmav_init(desc, mode)    \
  asm("sll %1, 24, %0\n\t"       \
      "vinsw %0, $31, 1, %0\n\t" \
      : "=r"(desc)               \
      : "r"(mode))

/*
 * dmav_set_stride(desc, stride, bsize): store a stride and a block size in
 * an existing descriptor.
 *
 * Operands: %0 = desc (read-write), %1 = desc again (tied to %0),
 * %2 = stride, %3 = bsize.
 * Template, in order:
 *   - the first line starts with '#', so it is presumably treated as an
 *     assembler comment (a disabled zap of bsize);
 *   - bsize is shifted left by 32 bits in its own register;
 *   - desc is zapped with mask 0x70 and OR-ed (bis) with the shifted bsize;
 *   - bsize is shifted right by 32 bits again;
 *   - stride is inserted into desc with vinsw at index 3.
 * NOTE(review): %3 is declared as an input only, yet the template writes it;
 * the closing srl looks like an attempt to restore it, which only holds if
 * bsize fits in 32 bits -- confirm.
 */
#define dmav_set_stride(desc, stride, bsize) \
  asm("#zap %3, 0x7f, %3\n\t"                \
      "sll %3, 32, %3\n\t"                   \
      "zap %0, 0x70, %0\n\t"                 \
      "bis %3, %0, %0\n\t"                   \
      "srl %3, 32, %3\n\t"                   \
      "vinsw %2, %0, 3, %0\n\t"              \
      : "+r"(desc)                           \
      : "0"(desc), "r"(stride), "r"(bsize))

/*
 * dmav_clear_stride(desc): reset the stride-related parts of a descriptor.
 *
 * Applies the same zap mask (0x70) and the same vinsw index (3) that
 * dmav_set_stride() uses, but inserts $31 instead of a stride value and
 * ORs nothing back in.
 * NOTE(review): presumably $31 reads as zero, making this the inverse of
 * dmav_set_stride() -- confirm against the ISA manual.
 *
 * desc is a read-write operand. The ':' that opens the output operand list
 * is required: without it "+r" is pasted onto the template string and the
 * statement does not compile.
 */
#define dmav_clear_stride(desc)  \
  asm("zap %0, 0x70, %0\n\t"     \
      "vinsw $31, %0, 3, %0\n\t" \
      : "+r"(desc)               \
      : "0"(desc))

/*
 * dmav_set_mask(desc, mask): store a mask value in an existing descriptor.
 *
 * Operands: %0 = desc (read-write), %1 = desc again (tied to %0), %2 = mask.
 * Template, in order:
 *   - vshff with sources $31 and desc and selector 0x01;
 *   - zap of desc with byte mask 0x08;
 *   - mask shifted left by 24 bits in its own register;
 *   - bis (OR) of the shifted mask into desc;
 *   - a second vshff, this time with desc as both sources, selector 0x01.
 * NOTE(review): the two shuffles presumably move the target lane into the
 * low position and back again -- confirm against the ISA manual.
 * NOTE(review): %2 is declared as an input only, but the template shifts it
 * in place and never shifts it back, so the caller's register copy of mask
 * is left modified -- confirm whether that is intended.
 */
//vshff 0, 0, 0, 1
#define dmav_set_mask(desc, mask)   \
  asm("vshff $31, %0, 0x01, %0\n\t" \
      "zap %0, 0x08, %0\n\t"        \
      "sll %2, 24, %2\n\t"          \
      "bis %2, %0, %0\n\t"          \
      "vshff %0, %0, 0x01, %0\n\t"  \
      : "+r"(desc)                  \
      : "0"(desc), "r"(mask))

/*
 * dmav_set_reply(desc, reply_addr): store the address of a reply word in an
 * existing descriptor.
 *
 * Operands: %0 = desc (read-write), %1 = desc again (tied to %0),
 * %2 = reply_addr.
 * Template, in order:
 *   - vshff with sources $31 and desc and selector 0x01;
 *   - zap of desc with byte mask 0x7;
 *   - bis (OR) of reply_addr into desc;
 *   - a second vshff with sources $31 and desc and selector 0x01.
 * NOTE(review): the closing shuffle here uses $31 as its first source,
 * whereas dmav_set_mask() closes with desc as both sources. One of the two
 * may be unintended -- confirm against the ISA manual.
 */
#define dmav_set_reply(desc, reply_addr) \
  asm("vshff $31, %0, 0x01, %0\n\t"      \
      "zap %0, 0x7, %0\n\t"              \
      "bis %0, %2, %0\n\t"               \
      "vshff $31, %0, 0x01, %0\n\t"      \
      : "+r"(desc)                       \
      : "0"(desc), "r"(reply_addr))

/*
 * dmav_set_size(desc, size): store a transfer size in an existing descriptor.
 *
 * Operands: %0 = desc (read-write), %1 = desc again (tied to %0), %2 = size.
 * A single vinsw inserts size into desc at index 0.
 */
#define dmav_set_size(desc, size) \
  asm("vinsw %2, %0, 0, %0\n\t"   \
      : "+r"(desc)                \
      : "0"(desc), "r"(size))

/*
 * dma_vasyn3(desc, mem, ldm, size): issue one DMA with an explicit size and
 * spin until its reply word becomes non-zero.
 *
 * Operands (all inputs): %0 = desc, %1 = mem, %2 = ldm, %3 = size.
 * Template, in order:
 *   - size is inserted into desc at index 0;
 *   - word 2 of desc is extracted into %3, zapped with 0xfc, and $31 is
 *     stored at that address (presumably clearing the reply word);
 *   - the dma instruction is issued with desc, mem and ldm;
 *   - loop at label 1: the same address is recomputed, the word there is
 *     loaded, and the loop repeats while the loaded value is zero;
 *   - word 0 of desc is extracted back into %3, which puts the size value
 *     back in the register it arrived in.
 *
 * The operand list needs an empty output section: all four operands carry a
 * plain "r" constraint, which is only valid for inputs, and "memory" must be
 * in the clobber section.
 * NOTE(review): %0 and %3 are written by the template although declared as
 * inputs. %3 is restored before the end; %0 keeps size in word 0 -- confirm
 * this is acceptable for the compiler in use.
 */
#define dma_vasyn3(desc, mem, ldm, size)         \
  asm("vinsw %3, %0, 0, %0\n\t"                  \
      "vextw %0, 2, %3\n\t"                      \
      "zap %3, 0xfc, %3\n\t"                     \
      "stw $31, 0(%3)\n\t"                       \
      "dma %0, %1, %2\n\t"                       \
      "1:"                                       \
      "vextw %0, 2, %3\n\t"                      \
      "zap %3, 0xfc, %3\n\t"                     \
      "ldw %3, 0(%3)\n\t"                        \
      "beq %3, 1b\n\t"                           \
      "vextw %0, 0, %3\n\t"                      \
      : /* no outputs */                         \
      : "r"(desc), "r"(mem), "r"(ldm), "r"(size) \
      : "memory")

/*
 * dma_vasyn2(desc, mem, ldm): issue one DMA using the size already held in
 * desc and spin until its reply word becomes non-zero.
 *
 * Operands (all inputs): %0 = desc, %1 = mem, %2 = ldm.
 * There is no spare operand, so %2 doubles as the scratch register:
 *   - ldm is parked in desc at index 4;
 *   - word 2 of desc is extracted into %2, zapped with 0xfc, and $31 is
 *     stored at that address (presumably clearing the reply word);
 *   - ldm is fetched back from index 4 and the dma instruction is issued;
 *   - loop at label 1: the address is recomputed, the word there is loaded,
 *     and the loop repeats while the loaded value is zero;
 *   - ldm is fetched back from index 4 once more and index 4 of desc is
 *     overwritten with $31.
 *
 * The template must only name %0..%2. References to %3 are out of range for
 * a three-operand statement, so the scratch uses are spelled %2. The operand
 * list also needs an empty output section, because plain "r" constraints are
 * only valid for inputs.
 * NOTE(review): %0 and %2 are written by the template although declared as
 * inputs; the final two instructions look intended to undo that, which
 * presumes word 4 of desc was zero on entry -- confirm.
 */
#define dma_vasyn2(desc, mem, ldm)    \
  asm("vinsw %2, %0, 4, %0\n\t"       \
      "vextw %0, 2, %2\n\t"           \
      "zap %2, 0xfc, %2\n\t"          \
      "stw $31, 0(%2)\n\t"            \
      "vextw %0, 4, %2\n\t"           \
      "dma %0, %1, %2\n\t"            \
      "1:"                            \
      "vextw %0, 2, %2\n\t"           \
      "zap %2, 0xfc, %2\n\t"          \
      "ldw %2, 0(%2)\n\t"             \
      "beq %2, 1b\n\t"                \
      "vextw %0, 4, %2\n\t"           \
      "vinsw $31, %0, 4, %0\n\t"      \
      : /* no outputs */              \
      : "r"(desc), "r"(mem), "r"(ldm) \
      : "memory")
/*
 * dma_rpl(desc, mem, ldm, reply): issue one DMA and return immediately.
 *
 * Operands (all inputs): %0 = desc, %1 = mem, %2 = ldm, %3 = &reply.
 * The template is a single dma instruction and does not wait for anything.
 * %3 is never named in the template; the address of reply is only passed as
 * an operand.
 * NOTE(review): presumably completion is observed later through the reply
 * word configured in desc (see dmav_set_reply) -- confirm.
 */
#define dma_rpl(desc, mem, ldm, reply)             \
  asm("dma %0, %1, %2\n\t"                         \
      :                                            \
      : "r"(desc), "r"(mem), "r"(ldm), "r"(&reply) \
      : "memory")
/*
 * dma_rpl_size(desc, mem, ldm, size, reply): insert a size into desc, issue
 * one DMA and return immediately.
 *
 * Operands (all inputs): %0 = desc, %1 = mem, %2 = ldm, %3 = size,
 * %4 = &reply. %4 is never named in the template.
 * The template inserts size into desc with vinsw at index 4 and then issues
 * the dma instruction; it does not wait for completion.
 * NOTE(review): dmav_set_size() and dma_vasyn3() insert the size at index 0,
 * while this macro uses index 4 -- confirm which index is correct.
 * NOTE(review): %0 is written by the template although declared as an input.
 */
#define dma_rpl_size(desc, mem, ldm, size, reply)             \
  asm("vinsw %3, %0, 4, %0\n\t"                               \
      "dma %0, %1, %2\n\t"                                    \
      :                                                       \
      : "r"(desc), "r"(mem), "r"(ldm), "r"(size), "r"(&reply) \
      : "memory")

/*
 * dma_syn(): spin until the reply word matches the expected count.
 *
 * Uses the caller-scope variables __reply and __count (declared by
 * dma_init()). Operands (both inputs): %0 = &__reply, %1 = __count.
 * Template, in order:
 *   - the reply address is inserted into %1 at index 1 with vinsf;
 *   - loop at label 1: the address is extracted back into %0, the word at
 *     that address is loaded, %1 is subtracted from it, and the loop repeats
 *     while the difference is non-zero.
 *
 * Both operands are inputs: &__reply is not an lvalue and a plain "r"
 * constraint is not a valid output constraint, so the output section is
 * left empty. The "memory" clobber keeps the compiler from carrying cached
 * values of DMA-written buffers across the wait.
 * NOTE(review): both registers are written by the template although declared
 * as inputs, and vinsf/vextf are used on integer operands -- confirm against
 * the ISA manual.
 */
#define dma_syn()                   \
  asm("vinsf %0, %1, 1, %1\n\t"     \
      "1:\n\t"                      \
      "vextf %1, 1, %0\n\t"         \
      "ldw %0, 0(%0)\n\t"           \
      "subw %0, %1, %0\n\t"         \
      "bne %0, 1b\n\t"              \
      : /* no outputs */            \
      : "r"(&__reply), "r"(__count) \
      : "memory")

/*
 * Convenience wrappers over the descriptors declared by dma_init().
 *
 * pe_get / pe_put / bcast_get go through dma_vasyn3() and therefore wait
 * for the transfer before returning.
 * pe_iget / pe_iput / bcast_iget go through dma_rpl_size() and return right
 * after issuing the transfer. dma_rpl_size() takes five arguments, so the
 * caller-scope __reply variable is passed as its reply argument.
 * NOTE(review): dma_syn() compares the reply word with __count, and nothing
 * here changes __count; presumably the caller must account for each issued
 * transfer before calling dma_syn() -- confirm.
 */
#define pe_get(mem, ldm, size) dma_vasyn3(__pe_get_desc, mem, ldm, size)
#define pe_put(mem, ldm, size) dma_vasyn3(__pe_put_desc, mem, ldm, size)
#define bcast_get(mem, ldm, size) dma_vasyn3(__bcast_get_desc, mem, ldm, size)
#define pe_iget(mem, ldm, size) dma_rpl_size(__pe_iget_desc, mem, ldm, size, __reply)
#define pe_iput(mem, ldm, size) dma_rpl_size(__pe_iput_desc, mem, ldm, size, __reply)
#define bcast_iget(mem, ldm, size) dma_rpl_size(__bcast_iget_desc, mem, ldm, size, __reply)

/*
 * dma_init(): declare and initialise everything the pe_* / bcast_* wrappers
 * and dma_syn() expect to find in the enclosing scope.
 *
 * Declares, in the caller's scope:
 *   - three descriptors for the waiting wrappers and three for the
 *     non-waiting (i-prefixed) wrappers;
 *   - the volatile ints __reply, __count and __reply_syn, all set to 0.
 * Each descriptor is then initialised with dmav_init() using its mode
 * constant and given a reply address with dmav_set_reply(): &__reply_syn for
 * the waiting descriptors, &__reply for the non-waiting ones.
 *
 * The per-descriptor initialiser is dmav_init(). A macro name is not
 * re-expanded inside its own expansion, so spelling it dma_init(desc, mode)
 * here would produce a call to an undeclared function.
 *
 * Because it introduces declarations, this macro must be used where a
 * declaration is allowed, and only once per scope.
 */
#define dma_init()                                          \
  dmav_t __pe_get_desc, __pe_put_desc, __bcast_get_desc;    \
  dmav_t __pe_iget_desc, __pe_iput_desc, __bcast_iget_desc; \
  volatile int __reply = 0, __count = 0, __reply_syn = 0;   \
  {                                                         \
    dmav_init(__pe_get_desc, PE_GET);                       \
    dmav_init(__pe_put_desc, PE_PUT);                       \
    dmav_init(__bcast_get_desc, BCAST_GET);                 \
    dmav_set_reply(__pe_get_desc, &__reply_syn);            \
    dmav_set_reply(__pe_put_desc, &__reply_syn);            \
    dmav_set_reply(__bcast_get_desc, &__reply_syn);         \
    dmav_init(__pe_iget_desc, PE_GET);                      \
    dmav_init(__pe_iput_desc, PE_PUT);                      \
    dmav_init(__bcast_iget_desc, BCAST_GET);                \
    dmav_set_reply(__pe_iget_desc, &__reply);               \
    dmav_set_reply(__pe_iput_desc, &__reply);               \
    dmav_set_reply(__bcast_iget_desc, &__reply);            \
  }
